Data Preprocessing

StateOfInterest = c("Arizona", "California", "Minnesota", "New Mexico", "New York", 
                   "Oklahoma", "South Carolina", "Tennessee", "Utah", "Virginia", 
                   "West Virginia", "Wisconsin")

Demographic & Policy Data (County-Level)

library(readr)
county_data_abridged = read_csv("county_data_abridged.csv")
dim(county_data_abridged)
## [1] 3244   87
names(county_data_abridged)
##  [1] "countyFIPS"                        "STATEFP"                          
##  [3] "COUNTYFP"                          "CountyName"                       
##  [5] "StateName"                         "State"                            
##  [7] "lat"                               "lon"                              
##  [9] "POP_LATITUDE"                      "POP_LONGITUDE"                    
## [11] "CensusRegionName"                  "CensusDivisionName"               
## [13] "Rural-UrbanContinuumCode2013"      "PopulationEstimate2018"           
## [15] "PopTotalMale2017"                  "PopTotalFemale2017"               
## [17] "FracMale2017"                      "PopulationEstimate65+2017"        
## [19] "PopulationDensityperSqMile2010"    "CensusPopulation2010"             
## [21] "MedianAge2010"                     "#EligibleforMedicare2018"         
## [23] "MedicareEnrollment,AgedTot2017"    "3-YrDiabetes2015-17"              
## [25] "DiabetesPercentage"                "HeartDiseaseMortality"            
## [27] "StrokeMortality"                   "Smokers_Percentage"               
## [29] "RespMortalityRate2014"             "#FTEHospitalTotal2017"            
## [31] "TotalM.D.'s,TotNon-FedandFed2017"  "#HospParticipatinginNetwork2017"  
## [33] "#Hospitals"                        "#ICU_beds"                        
## [35] "dem_to_rep_ratio"                  "PopMale<52010"                    
## [37] "PopFmle<52010"                     "PopMale5-92010"                   
## [39] "PopFmle5-92010"                    "PopMale10-142010"                 
## [41] "PopFmle10-142010"                  "PopMale15-192010"                 
## [43] "PopFmle15-192010"                  "PopMale20-242010"                 
## [45] "PopFmle20-242010"                  "PopMale25-292010"                 
## [47] "PopFmle25-292010"                  "PopMale30-342010"                 
## [49] "PopFmle30-342010"                  "PopMale35-442010"                 
## [51] "PopFmle35-442010"                  "PopMale45-542010"                 
## [53] "PopFmle45-542010"                  "PopMale55-592010"                 
## [55] "PopFmle55-592010"                  "PopMale60-642010"                 
## [57] "PopFmle60-642010"                  "PopMale65-742010"                 
## [59] "PopFmle65-742010"                  "PopMale75-842010"                 
## [61] "PopFmle75-842010"                  "PopMale>842010"                   
## [63] "PopFmle>842010"                    "3-YrMortalityAge<1Year2015-17"    
## [65] "3-YrMortalityAge1-4Years2015-17"   "3-YrMortalityAge5-14Years2015-17" 
## [67] "3-YrMortalityAge15-24Years2015-17" "3-YrMortalityAge25-34Years2015-17"
## [69] "3-YrMortalityAge35-44Years2015-17" "3-YrMortalityAge45-54Years2015-17"
## [71] "3-YrMortalityAge55-64Years2015-17" "3-YrMortalityAge65-74Years2015-17"
## [73] "3-YrMortalityAge75-84Years2015-17" "3-YrMortalityAge85+Years2015-17"  
## [75] "mortality2015-17Estimated"         "stay at home"                     
## [77] ">50 gatherings"                    ">500 gatherings"                  
## [79] "public schools"                    "restaurant dine-in"               
## [81] "entertainment/gym"                 "federal guidelines"               
## [83] "foreign travel ban"                "SVIPercentile"                    
## [85] "HPSAShortage"                      "HPSAServedPop"                    
## [87] "HPSAUnderservedPop"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#EligibleforMedicare2018"] = "EligibleforMedicare2018"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#FTEHospitalTotal2017"] = "FTEHospitalTotal2017"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#HospParticipatinginNetwork2017"] = "HospParticipatinginNetwork2017"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#Hospitals"] = "Hospitals"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "#ICU_beds"] = "ICU_beds"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "PopulationEstimate65+2017"] = "PopulationEstimate_above65_2017"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "stay at home"] = "stay_at_home"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == ">50 gatherings"] = "above_50_gatherings"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == ">500 gatherings"] = "above_500_gatherings"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "public schools"] = "public_schools"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "restaurant dine-in"] = "restaurant_dine_in"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "entertainment/gym"] = "entertainment_gym"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "federal guidelines"] = "federal_guidelines"
colnames(county_data_abridged)[colnames(county_data_abridged) 
                               == "foreign travel ban"] = "foreign_travel_ban"
data = subset(county_data_abridged,
              select = c(State, CountyName, POP_LATITUDE, POP_LONGITUDE,
                         PopulationEstimate2018, PopTotalMale2017, 
                         PopulationEstimate_above65_2017, PopulationDensityperSqMile2010, 
                         DiabetesPercentage, Smokers_Percentage, 
                         HeartDiseaseMortality, StrokeMortality, 
                         Hospitals, ICU_beds, HospParticipatinginNetwork2017, 
                         stay_at_home, above_50_gatherings, above_500_gatherings, 
                         restaurant_dine_in, entertainment_gym))
data = na.omit(data)
data = droplevels(data)

data$stay_at_home = data$stay_at_home - range(data$stay_at_home)[1]
data$above_50_gatherings = data$above_50_gatherings - range(data$above_50_gatherings)[1]
data$above_500_gatherings = data$above_500_gatherings - range(data$above_500_gatherings)[1]
data$restaurant_dine_in = data$restaurant_dine_in - range(data$restaurant_dine_in)[1]
data$entertainment_gym = data$entertainment_gym - range(data$entertainment_gym)[1]

str(data)
## tibble [2,585 × 20] (S3: tbl_df/tbl/data.frame)
##  $ State                          : chr [1:2585] "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ CountyName                     : chr [1:2585] "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ POP_LATITUDE                   : num [1:2585] 32.5 30.5 31.8 33 34 ...
##  $ POP_LONGITUDE                  : num [1:2585] -86.5 -87.8 -85.3 -87.1 -86.6 ...
##  $ PopulationEstimate2018         : num [1:2585] 55601 218022 24881 22400 57840 ...
##  $ PopTotalMale2017               : num [1:2585] 27007 103225 13335 12138 28607 ...
##  $ PopulationEstimate_above65_2017: num [1:2585] 8392 42413 4757 3632 10351 ...
##  $ PopulationDensityperSqMile2010 : num [1:2585] 91.8 114.7 31 36.8 88.9 ...
##  $ DiabetesPercentage             : num [1:2585] 9.9 8.5 15.7 13.3 14.9 22.4 16.9 15.6 17.5 12.2 ...
##  $ Smokers_Percentage             : num [1:2585] 18.1 17.5 22 19.1 19.2 ...
##  $ HeartDiseaseMortality          : num [1:2585] 204 183 220 226 225 ...
##  $ StrokeMortality                : num [1:2585] 56.1 41.9 49 57.2 52.8 54.1 59 44 45.2 47.3 ...
##  $ Hospitals                      : num [1:2585] 1 3 1 1 1 1 1 2 0 1 ...
##  $ ICU_beds                       : num [1:2585] 6 51 5 0 6 0 7 24 0 0 ...
##  $ HospParticipatinginNetwork2017 : num [1:2585] 0 0 0 0 1 0 0 0 0 0 ...
##  $ stay_at_home                   : num [1:2585] 16 16 16 16 16 16 16 16 16 16 ...
##  $ above_50_gatherings            : num [1:2585] 5 5 5 5 5 5 5 5 5 5 ...
##  $ above_500_gatherings           : num [1:2585] 2 2 2 2 2 2 2 2 2 2 ...
##  $ restaurant_dine_in             : num [1:2585] 7 7 7 7 7 7 7 7 7 7 ...
##  $ entertainment_gym              : num [1:2585] 16 16 16 16 16 16 16 16 16 16 ...
##  - attr(*, "na.action")= 'omit' Named int [1:659] 68 69 70 71 72 73 74 75 76 77 ...
##   ..- attr(*, "names")= chr [1:659] "68" "69" "70" "71" ...
data_demographic_county = data

Infection Data (State-Level)

Covid-19 Cases

timeseries = read_csv("../datasets/timeseries.csv")
data = timeseries
data = subset(data, country == "United States" & level == "state")
data = subset(data, !(name %in% c("Unassigned cases, Arkansas, US", 
              "Unassigned cases, Georgia, US", "Unassigned cases, Illinois, US",
              "Unassigned cases, Iowa, US", "Unassigned cases, Maine, US",
              "Unassigned cases, Massachusetts, US", "Unassigned cases, North Dakota, US",
              "Washington, D.C., US")))
data$state = matrix(unlist(strsplit(as.character(data$name), ", ")), ncol = 2, byrow = TRUE)[, 1]
data = subset(data,
              select = c(state, date, cases, deaths, recovered))

data = subset(data, state %in% StateOfInterest)

data$state =     as.factor(data$state)
data$cases =     as.numeric(data$cases)
data$deaths =    as.numeric(data$deaths)
data$recovered = as.numeric(data$recovered)

data = na.omit(data)
data = droplevels(data)

str(data)
## tibble [1,578 × 5] (S3: tbl_df/tbl/data.frame)
##  $ state    : Factor w/ 12 levels "Arizona","California",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date     : Date[1:1578], format: "2020-04-14" "2020-04-15" ...
##  $ cases    : num [1:1578] 3806 3962 4234 4507 4719 ...
##  $ deaths   : num [1:1578] 131 142 150 169 177 184 187 208 229 249 ...
##  $ recovered: num [1:1578] 249 385 460 539 539 ...
##  - attr(*, "problems")= tibble [1,666,558 × 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:1666558] 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 ...
##   ..$ col     : chr [1:1666558] "state" "state" "state" "state" ...
##   ..$ expected: chr [1:1666558] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
##   ..$ actual  : chr [1:1666558] "Burgenland" "Burgenland" "Burgenland" "Burgenland" ...
##   ..$ file    : chr [1:1666558] "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" "'../datasets/timeseries.csv'" ...
##  - attr(*, "na.action")= 'omit' Named int [1:894] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:894] "1" "2" "3" "4" ...
range(data$cases)
## [1]    237 613076
range(data$deaths)
## [1]     1 25250
range(data$recovered)
## [1]      2 266287
data_cases_state = data

SEIR Model Parameters

model_out = read_csv("../datasets/model_out.csv")
str(model_out)
## tibble [73 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ state    : chr [1:73] "Arizona" "Arizona" "Arizona" "Arizona" ...
##  $ startdate: Date[1:73], format: "2020-04-16" "2020-05-01" ...
##  $ enddate  : Date[1:73], format: "2020-04-30" "2020-05-15" ...
##  $ k        : num [1:73] 0.783 0.863 0.865 0.949 0.986 ...
##  $ sigma    : num [1:73] 0.0714 0.0714 0.0714 0.0714 0.0714 ...
##  $ lamda    : num [1:73] 0.0563 0.07 0.1266 0.0784 0.0925 ...
##  $ c        : num [1:73] 1 0 0.127 1 0 ...
##  $ alpha    : num [1:73] 0.06825 0.00721 0.06765 0.10224 0.05641 ...
##  $ omega    : num [1:73] 0.00275 0.003091 0.001429 0.001371 0.000698 ...
##  $ miu      : num [1:73] 0.0384 0.0175 0.0338 0.036 0.014 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   state = col_character(),
##   ..   startdate = col_date(format = ""),
##   ..   enddate = col_date(format = ""),
##   ..   k = col_double(),
##   ..   sigma = col_double(),
##   ..   lamda = col_double(),
##   ..   c = col_double(),
##   ..   alpha = col_double(),
##   ..   omega = col_double(),
##   ..   miu = col_double()
##   .. )
data_parm = model_out

Combined Data of Intersts (State-Level)

data = data_parm

n = length(data$state)
data$cases = rep(0, n)
data$deaths = rep(0, n)
data$recovered = rep(0, n)

for (i in 1:n){
  j = data_cases_state$state == data$state[i] & data_cases_state$date == data$startdate[i]
  if (sum(j) == 0){
    data[i, ] = NA
  }else{
    data[i, c("cases", "deaths", "recovered")] = data_cases_state[j, c("cases", "deaths", "recovered")]
  }
}

data = na.omit(data)

data_demographic_state = as.data.frame(matrix(nrow = length(data$state), ncol = length(colnames(data_demographic_county)) - 1))
colnames(data_demographic_state) = colnames(data_demographic_county)[colnames(data_demographic_county) != "CountyName"]
data_demographic_state$State = data$state

for (s in data_demographic_state$State){
  for (k in 2:ncol(data_demographic_state)){
    data_demographic_state[data_demographic_state$State == s, k] = 
      mean(unlist(data_demographic_county[data_demographic_county$State == s, k+1]))
  }
}

data = cbind(data, 
             subset(data_demographic_state[, colnames(data_demographic_state)[colnames(data_demographic_state) != "State"]]))
data = na.omit(data)
data$days = rep(0, nrow(data))
data$temp = rep(0, nrow(data))
for (i in 1:nrow(data)){
  data$temp = diff.Date(c(data[i, ]$startdate, data[i, ]$enddate))
}
for (s in data$state){
  for (d in 1:nrow(subset(data, state == s))){
    data[data$state == s, "days"][d] = sum(subset(data, state == s)[1:d, "temp"])
  }
}
data = data[, colnames(data)[colnames(data) != "temp"]]
data_unamed = data[, colnames(data)[!(colnames(data) %in% c("state", "startdate", "enddate"))]]
# head(data_demographic_county, 10)
# head(data_demographic_state, 10)
# head(data[, c("state", "startdate", "enddate", "days")], 10)
head(data, 10)
##         state  startdate    enddate      k   sigma   lamda       c   alpha
## 1     Arizona 2020-04-16 2020-04-30 0.7834 0.07143 0.05633 1.00000 0.06825
## 2     Arizona 2020-05-01 2020-05-15 0.8631 0.07143 0.07001 0.00000 0.00721
## 3     Arizona 2020-05-16 2020-05-30 0.8653 0.07143 0.12657 0.12709 0.06765
## 4     Arizona 2020-06-01 2020-06-15 0.9492 0.07143 0.07839 1.00000 0.10224
## 5     Arizona 2020-06-16 2020-06-30 0.9864 0.07143 0.09253 0.00000 0.05641
## 6     Arizona 2020-07-01 2020-07-15 0.9732 0.07143 0.06093 0.00000 0.00000
## 7  California 2020-04-16 2020-04-30 0.8574 0.07143 0.07211 0.07752 0.02728
## 8  California 2020-05-01 2020-05-15 0.9481 0.07143 0.02832 1.00000 0.01925
## 9  California 2020-05-16 2020-05-30 0.9598 0.07143 0.04972 0.40540 0.03497
## 10 California 2020-06-01 2020-06-15 1.1270 0.07143 0.10206 0.01176 0.03073
##        omega      miu  cases deaths recovered POP_LATITUDE POP_LONGITUDE
## 1  0.0027505 0.038429   4234    150       460        33.58        -111.5
## 2  0.0030913 0.017520   7962    330      1528        33.58        -111.5
## 3  0.0014293 0.033751  13631    679      3357        33.58        -111.5
## 4  0.0013712 0.036006  20123    917      4869        33.58        -111.5
## 5  0.0006979 0.013973  39097   1219      6598        33.58        -111.5
## 6  0.0000000 0.003805  84092   1720      9715        33.58        -111.5
## 7  0.0023628 0.007923  28035    970      1753        37.82        -120.9
## 8  0.0013106 0.006909  52152   2131      5130        37.82        -120.9
## 9  0.0009157 0.012128  78704   3207      9098        37.82        -120.9
## 10 0.0006310 0.012709 115032   4219     17585        37.82        -120.9
##    PopulationEstimate2018 PopTotalMale2017 PopulationEstimate_above65_2017
## 1                  478110           232553                           80116
## 2                  478110           232553                           80116
## 3                  478110           232553                           80116
## 4                  478110           232553                           80116
## 5                  478110           232553                           80116
## 6                  478110           232553                           80116
## 7                  682018           338751                           94920
## 8                  682018           338751                           94920
## 9                  682018           338751                           94920
## 10                 682018           338751                           94920
##    PopulationDensityperSqMile2010 DiabetesPercentage Smokers_Percentage
## 1                           52.05             10.060              16.48
## 2                           52.05             10.060              16.48
## 3                           52.05             10.060              16.48
## 4                           52.05             10.060              16.48
## 5                           52.05             10.060              16.48
## 6                           52.05             10.060              16.48
## 7                          663.26              8.505              12.09
## 8                          663.26              8.505              12.09
## 9                          663.26              8.505              12.09
## 10                         663.26              8.505              12.09
##    HeartDiseaseMortality StrokeMortality Hospitals ICU_beds
## 1                  148.8           30.90     5.067    103.9
## 2                  148.8           30.90     5.067    103.9
## 3                  148.8           30.90     5.067    103.9
## 4                  148.8           30.90     5.067    103.9
## 5                  148.8           30.90     5.067    103.9
## 6                  148.8           30.90     5.067    103.9
## 7                  153.9           37.89     5.672    126.5
## 8                  153.9           37.89     5.672    126.5
## 9                  153.9           37.89     5.672    126.5
## 10                 153.9           37.89     5.672    126.5
##    HospParticipatinginNetwork2017 stay_at_home above_50_gatherings
## 1                           1.800           12                   2
## 2                           1.800           12                   2
## 3                           1.800           12                   2
## 4                           1.800           12                   2
## 5                           1.800           12                   2
## 6                           1.800           12                   2
## 7                           1.845            0                   4
## 8                           1.845            0                   4
## 9                           1.845            0                   4
## 10                          1.845            0                   4
##    above_500_gatherings restaurant_dine_in entertainment_gym days
## 1                     6                  7                 7   14
## 2                     6                  7                 7   28
## 3                     6                  7                 7   42
## 4                     6                  7                 7   56
## 5                     6                  7                 7   70
## 6                     6                  7                 7   84
## 7                     8                  3                 3   14
## 8                     8                  3                 3   28
## 9                     8                  3                 3   42
## 10                    8                  3                 3   56

Output Models

model_fit_out = data.frame()

Testing

write.csv(model_fit_out, file = "model_fit_out.csv")

Project Intro

Draft and Outline

  1. What we have achieved is a simple, interpretable, accessible model that anyone can get their hands on. It not only works towards assisting the prediction and prevention of pandemic, but also gives us clues on the relationship between variable social factors and the vulnerability of one place to the virus.

  2. This is our very first step to make our own contribution to the control of the pandemic. We believe that everyone can take a step to do their parts, and we need not be freaked out by the technological obstacles and the already-done achievements by the “professionals”.

  3. My partner and I are both undergraduates in non-CS majors, and this is our first time touching AWS or any other cloud service system. We started everything offline, accomplishing every tasks locally, and it did take us a while to figure out where to incorporate all those into AWS. But soon we see the enriched potentials and capability of AWS. Due to the limited time, we only took a slight bite of the AWS system, but we will be working to dig deeper.

  4. The next step to take is to integrate everything into an efficient cloud computing system so that larger data and more sophisticated models could be handled, especially in a well-automated manner.

  5. There are interested stuffs like the political factor. We may wanna see how it affects We have learned a lot from this experience and we are looking forwards to becoming professional data analytist…blabla

Appendix

Modeling Procedures

Data Points pairs() Plot

Modeling k

mod_k_1 = lm(k ~ days, data = data_unamed)
mod_k_2 = lm(k ~ . - k, data = data_unamed)
summary(mod_k_2)
## 
## Call:
## lm(formula = k ~ . - k, data = data_unamed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5410 -0.0388  0.0132  0.0491  0.2173 
## 
## Coefficients: (9 not defined because of singularities)
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      1.30e+01   3.49e+00    3.73  0.00052 ***
## sigma                                  NA         NA      NA       NA    
## lamda                           -5.83e+00   1.14e+00   -5.13  5.6e-06 ***
## c                               -2.86e-01   6.77e-02   -4.22  0.00011 ***
## alpha                            2.46e+00   9.56e-01    2.58  0.01317 *  
## omega                            3.12e+01   2.43e+01    1.29  0.20429    
## miu                              3.73e+00   1.91e+00    1.96  0.05664 .  
## cases                            1.72e-07   1.55e-06    0.11  0.91257    
## deaths                           1.29e-05   1.81e-05    0.71  0.47941    
## recovered                       -4.39e-06   5.96e-06   -0.74  0.46514    
## POP_LATITUDE                    -1.59e-01   4.49e-02   -3.54  0.00093 ***
## POP_LONGITUDE                    5.61e-02   1.50e-02    3.73  0.00052 ***
## PopulationEstimate2018          -8.87e-05   2.56e-05   -3.46  0.00118 ** 
## PopTotalMale2017                 1.68e-04   4.94e-05    3.41  0.00137 ** 
## PopulationEstimate_above65_2017  5.73e-05   1.57e-05    3.65  0.00067 ***
## PopulationDensityperSqMile2010  -6.50e-06   1.32e-04   -0.05  0.96088    
## DiabetesPercentage              -6.81e-01   1.80e-01   -3.79  0.00043 ***
## Smokers_Percentage               2.70e-01   7.69e-02    3.51  0.00101 ** 
## HeartDiseaseMortality           -2.32e-02   8.51e-03   -2.72  0.00907 ** 
## StrokeMortality                  1.33e-01   4.09e-02    3.25  0.00218 ** 
## Hospitals                              NA         NA      NA       NA    
## ICU_beds                               NA         NA      NA       NA    
## HospParticipatinginNetwork2017         NA         NA      NA       NA    
## stay_at_home                           NA         NA      NA       NA    
## above_50_gatherings                    NA         NA      NA       NA    
## above_500_gatherings                   NA         NA      NA       NA    
## restaurant_dine_in                     NA         NA      NA       NA    
## entertainment_gym                      NA         NA      NA       NA    
## days                             3.53e-03   1.35e-03    2.61  0.01204 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.135 on 46 degrees of freedom
## Multiple R-squared:  0.598,  Adjusted R-squared:  0.432 
## F-statistic: 3.61 on 19 and 46 DF,  p-value: 0.000191

Modeling sigma

# mod_sigma = 

Modeling lambda

# mod_lambda = 

Modeling c

# mod_c = 

Modeling alpha

# mod_alpha = 

Modeling omega

# mod_omega = 

Modeling miu

# mod_miu = 

Helper Functions

get_bp_decision = function(model, alpha) {
  decide = unname(bptest(model)$p.value < alpha)
  ifelse(decide, "Reject", "Fail to Reject")
}

get_bp_pval = function(model) {
  bptest(model)$p.value
}

get_sw_decision = function(model, alpha) {
  decide = unname(shapiro.test(resid(model))$p.value < alpha)
  ifelse(decide, "Reject", "Fail to Reject")
}

get_sw_pval = function(model) {
  shapiro.test(resid(model))$p.value
}

get_num_params = function(model) {
  length(coef(model))
}

get_loocv_rmse = function(model, is_log) {
  ifelse(
    is_log, 
    sqrt(mean(na.omit(((dat_trn$price - exp(fitted(model))) / (1 - hatvalues(model))) ^ 2))),
    sqrt(mean((resid(model) / (1 - hatvalues(model))) ^ 2))
  )
}

get_adj_r2 = function(model) {
  summary(model)$adj.r.squared
}

test_mod = function(model, is_log = FALSE){
  c(loocv_rmse = get_loocv_rmse(model, is_log), 
    adj_r2 = get_adj_r2(model), 
    bp_pval = get_bp_pval(model), 
    sw_pval = get_sw_pval(model), 
    num_params = get_num_params(model))
}

diagnostics = function(model, pcol = "grey", lcol = "dodgerblue", alpha = 0.05, plotit = TRUE, testit = TRUE){
  if (plotit){
    par(mfrow = c(1, 2), pty="s")
    
    plot(fitted(model), resid(model), col = "grey", pch = 20, 
         xlab = "Fitted", ylab = "Residual", 
         main = "Fitted versus Residuals")
    abline(h = 0, col = "darkorange", lwd = 2)
    
    qqnorm(resid(model), col = pcol)
    qqline(resid(model), col = lcol, lwd = 2)
  }
  if (testit){
    list(p_val = shapiro.test(resid(model))$p, 
         decision = ifelse(test = shapiro.test(resid(model))$p < alpha, 
                           yes = "Reject", no = "Fail to Reject"))
  }
}

get_test_rmse = function(model) {
  sqrt(mean((dat_tst$price - predict(model, newdata = dat_tst))^ 2))
}

get_perc_err = function(model) {
  actual = dat_tst$price
  predicted = predict(model, dat_tst)
  100 * mean((abs(actual - predicted)) / actual)
}